import numpy as np
import pandas as pd
from functools import partial
from IPython.display import HTML, display
from colorcet import fire
import datashader as ds
import dask.dataframe as dd
from bokeh.io import output_notebook
from bokeh.plotting import figure, output_notebook, show
from datashader.bokeh_ext import InteractiveImage
from datashader import transfer_functions as tf
from datashader.utils import export_image
from datashader.colors import colormap_select, Greys9, Hot, viridis, inferno
import geoviews as gv
import cartopy.crs as ccrs
import holoviews as hv
from holoviews.operation.datashader import aggregate, datashade
import warnings
warnings.filterwarnings('ignore')
Using motor vehicle collission data from https://data.cityofnewyork.us/Public-Safety/NYPD-Motor-Vehicle-Collisions/h9gi-nx95
This is a breakdown of every collision in NYC by location and injury. This data is collected because the NYC Council passed Local Law #11 in 2011. This data is manually run every month and reviewed by the TrafficStat Unit before being posted on the NYPD website. Each record represents a collision in NYC by city, borough, precinct and cross street. This data can be used by the public to see how dangerous/safe intersections are in NYC.
df = pd.read_csv('data/NYPD_Motor_Vehicle_Collisions.csv')
df = df[df.LATITUDE > 40.5011072]
df = df[df.LATITUDE < 40.9122231]
df = df[df.LONGITUDE > -74.2482769]
df = df[df.LONGITUDE < -73.700584]
df.head()
df.columns
df[['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED']].sum()
agg = ds.Canvas(750,750).points(df, 'LONGITUDE','LATITUDE')
tf.set_background(tf.shade(agg, cmap=fire),"black")
def df_factor(factor):
return df[(df['CONTRIBUTING FACTOR VEHICLE 1']==factor) | \
(df['CONTRIBUTING FACTOR VEHICLE 2']==factor) | \
(df['CONTRIBUTING FACTOR VEHICLE 3']==factor) | \
(df['CONTRIBUTING FACTOR VEHICLE 4']==factor) | \
(df['CONTRIBUTING FACTOR VEHICLE 5']==factor) ]
def df_vehicle(vehicle_type):
return df[(df['VEHICLE TYPE CODE 1']==vehicle_type) | \
(df['VEHICLE TYPE CODE 2']==vehicle_type) | \
(df['VEHICLE TYPE CODE 3']==vehicle_type) | \
(df['VEHICLE TYPE CODE 4']==vehicle_type) | \
(df['VEHICLE TYPE CODE 5']==vehicle_type) ]
contributing_factors = set()
for column in df.columns:
if column.startswith('CONTRIBUTING FACTOR'):
contributing_factors.update(list(df[column].unique()))
contributing_factors
contributing_factors.remove(np.nan)
for factor in contributing_factors:
factor_df = df_factor(factor)
print('{} : {} incidents,'.format(factor, len(factor_df)))
agg = ds.Canvas(750,750).points(factor_df, 'LONGITUDE','LATITUDE')
display(tf.set_background(tf.shade(agg, cmap=fire),"black"))
df.loc[:, 'lon'], df.loc[:, 'lat'] = ds.utils.lnglat_to_meters(df.LONGITUDE,df.LATITUDE)
hv.extension('bokeh')
url = 'https://server.arcgisonline.com/arcgis/rest/services/Canvas/World_Dark_Gray_Base/MapServer/tile/{z}/{y}/{x}'
tile_opts = dict(width=1000,height=600,xaxis=None,yaxis=None,bgcolor='black',show_grid=False)
map_tiles = gv.WMTS(url).opts(style=dict(alpha=0.5), plot=tile_opts)
points = hv.Points(df, ['lon', 'lat'])
taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=1000, height=600)
map_tiles * taxi_trips
df['hour'] = pd.to_datetime(df['TIME'], format='%H:%M').dt.hour
df['hourcat'] = df['hour'].astype("category")
NYC = x_range, y_range = ((-8242000,-8210000), (4965000,4990000))
plot_width = int(750)
plot_height = int(plot_width//1.2)
background = "black"
export = partial(export_image, export_path="export", background=background)
cm = partial(colormap_select, reverse=(background=="black"))
def base_plot(tools='pan,wheel_zoom,reset',plot_width=plot_width, plot_height=plot_height, **plot_args):
p = figure(tools=tools, plot_width=plot_width, plot_height=plot_height,
x_range=x_range, y_range=y_range, outline_line_color=None,
min_border=0, min_border_left=0, min_border_right=0,
min_border_top=0, min_border_bottom=0, **plot_args)
p.axis.visible = False
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
return p
options = dict(line_color=None, fill_color='blue', size=5)
from bokeh.models import WMTSTileSource
colors = ["#FF0000","#FF3F00","#FF7F00","#FFBF00","#FFFF00","#BFFF00","#7FFF00","#3FFF00",
"#00FF00","#00FF3F","#00FF7F","#00FFBF","#00FFFF","#00BFFF","#007FFF","#003FFF",
"#0000FF","#3F00FF","#7F00FF","#BF00FF","#FF00FF","#FF00BF","#FF007F","#FF003F",]
def colorized_images(x_range, y_range, w=plot_width, h=plot_height):
cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
agg = cvs.points(df, 'lon', 'lat', ds.count_cat('hourcat'))
img = tf.shade(agg, color_key=colors)
return tf.dynspread(img, threshold=0.3, max_px=4)
p = base_plot(background_fill_color=background)
#p.add_tile(WMTSTileSource(url='https://server.arcgisonline.com/arcgis/rest/services/Canvas/World_Dark_Gray_Base/MapServer/tile/{z}/{y}/{x}'))
export(colorized_images(*NYC),"NYC_collission_times")
InteractiveImage(p, colorized_images)
Here the order of colors is roughly red (midnight), yellow (4am), green (8am), cyan (noon), blue (4pm), purple (8pm), and back to red (since hours and colors are both cyclic).
Where do more bike collisions occur(red) than those involving Taxis (blue)?
dftaxi = df_vehicle('TAXI')
dfbike = df_vehicle('BICYCLE')
def merged_images(x_range, y_range, w=plot_width, h=plot_height, how='log'):
cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
taxi = cvs.points(dftaxi, 'lon', 'lat')
bike = cvs.points(dfbike, 'lon', 'lat')
more_drops = tf.shade(taxi.where(taxi > bike), cmap=["darkblue", 'cornflowerblue'], how=how)
more_picks = tf.shade(bike.where(bike > taxi), cmap=["darkred", 'orangered'], how=how)
img = tf.stack(more_picks,more_drops)
return tf.dynspread(img, threshold=0.3, max_px=4)
p = base_plot(background_fill_color=background)
export(merged_images(*NYC),"NYC_bikes_vs_taxis")
InteractiveImage(p, merged_images)
basemap = WMTSTileSource(url='https://server.arcgisonline.com/arcgis/rest/services/Canvas/World_Light_Gray_Base/MapServer/tile/{z}/{y}/{x}')
def create_image_df(x_range, y_range, w=plot_width, h=plot_height):
cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
agg = cvs.points(dfx, 'lon', 'lat')
img = tf.shade(agg.where(agg>np.percentile(agg,90)), cmap=inferno, how='eq_hist')
return tf.dynspread(img, threshold=0.3, max_px=4)
dfx = dftaxi
p = base_plot()
p.add_tile(basemap)
export(create_image_df(*NYC),"NYC_Collissions")
InteractiveImage(p, create_image_df)
%matplotlib inline
df['hour'].value_counts().sort_index().plot(kind='bar', color='k')
df['DATE'] = pd.to_datetime(df['DATE'])
df['day_week'] = df.DATE.dt.dayofweek
day_week_labels = ['Mon', 'Tues', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
day_week_hr_labels = [day_week_labels[day] + '-' + str(hour)
for day in range(0,7)
for hour in range(0,25)]
df['day_week_hour'] = df.day_week*25 + df.hour
import matplotlib.pyplot as plt
fig = plt.figure()
fig.set_size_inches(50,6)
fig.add_subplot(111)
ax_day_week_hr = df['day_week_hour'].value_counts().sort_index().plot(kind='bar', color='k')
ax_day_week_hr.set_xticklabels(day_week_hr_labels);
df_alcohol = df_factor('Alcohol Involvement')
Alcohol related collisions increase markedly on Friday and Saturday evenings.
fig = plt.figure()
fig.set_size_inches(50,6)
fig.add_subplot(111)
ax_day_week_hr = df_alcohol['day_week_hour'].value_counts().sort_index().plot(kind='bar', color='k')
ax_day_week_hr.set_xticklabels(day_week_hr_labels);
edestrian injuries peak during early mornings and evenings:
fig = plt.figure()
fig.set_size_inches(50,6)
fig.add_subplot(111)
ax_day_week_hr = df[df['NUMBER OF PEDESTRIANS INJURED']> 0]['day_week_hour'].value_counts().sort_index().plot(kind='bar', color='k')
ax_day_week_hr.set_xticklabels(day_week_hr_labels);
df['dayofyear'] = df.DATE.dt.dayofyear
df['year'] = df.DATE.dt.year
df['month'] = df.DATE.dt.month
df['day'] = df.DATE.dt.day
df.to_pickle('df.pkl')
dftime = df.set_index('DATE')
import calmap
dftime['incident']= 1
calmap.calendarplot(dftime['incident'], linewidth=0, cmap='OrRd', fig_kws=dict(figsize=(18, 14)));
df_alcohol = df_factor('Alcohol Involvement')
df_alcohol['incident'] = 1
calmap.calendarplot(df_alcohol.set_index('DATE')['incident'], linewidth=0, cmap='OrRd', fig_kws=dict(figsize=(18, 14)));
As is evident from the calendar hotspots above, accidents tend to go down during the weekends in general, but those involving drunk driving increase during the weekends. Additionally, there are hotspots on certain days of the year, such as New Years day, last Monday of May (Memorial day), July 4th (Independence day),first Monday in September(Labor day), fourth Thursday of November (Thanksgiving day), and Christmas eve. We also see missing / mislabeled data for several months in 2016 and so it cannot be included as part of our predictive model.
streets = df.groupby('ON STREET NAME')['ON STREET NAME']
count = streets.count()
count.sort_values(ascending=False, inplace=True)
print(count[:20])